import requests
from bs4 import BeautifulSoup
import csv
import time


# 请求网页
# Fetch a page and return its HTML.
def page_def(url, ua, timeout=10):
    """Request *url* with the given headers and return the body as UTF-8 text.

    Args:
        url: Page URL to fetch.
        ua: Headers dict (carries the User-Agent).
        timeout: Seconds before the request is aborted. Added (with a
            backward-compatible default) because the original call had no
            timeout and could hang indefinitely on a stalled connection.

    Returns:
        The response body decoded as UTF-8.
    """
    resp = requests.get(url, headers=ua, timeout=timeout)
    html = resp.content.decode('utf-8')
    return html


# 解析网页
# Parse the listing page.
def info_def(html):
    """Parse a quote-listing page into sentences and sub-page links.

    Args:
        html: HTML text of one listing page.

    Returns:
        A two-element list ``[href_list, sentence_list]`` where
        ``sentence_list`` holds ``"<sentence>---<poet>"`` strings and
        ``href_list`` holds the matching absolute sub-page URLs.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # NOTE: the original also fetched soup('title') into an unused variable;
    # that dead code has been removed.

    sentences = soup.select('div.left > div.sons > div.cont > a:nth-of-type(1)')
    poets = soup.select('div.left > div.sons > div.cont > a:nth-of-type(2)')

    sentence_list = []
    href_list = []

    # zip() stops at the shorter of the two lists, which replaces the
    # original manual index/length check with the same semantics.
    for sent, poet in zip(sentences, poets):
        sentence_list.append(sent.get_text() + "---" + poet.get_text())
        href_list.append("https://so.gushiwen.org" + sent.get('href'))

    return [href_list, sentence_list]


# 写入CSV文件
# Write sentence/poet pairs to a CSV file.
def csv_def(info_list):
    """Append the ``"sentence---poet"`` entries in ``info_list[1]`` to sentence.csv.

    Bug fix: the file is opened in append mode, yet the original wrote the
    header row on every call — so crawling 5 pages produced 5 duplicate
    "Sentence,Poet" header lines. The header is now written only when the
    file is empty (``f.tell() == 0`` right after opening for append).

    Args:
        info_list: ``[href_list, sentence_list]`` as produced by info_def.
    """
    with open(r'sentence.csv', 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if f.tell() == 0:  # empty file -> header not yet written
            writer.writerow(["Sentence", "Poet"])
        for entry in info_list[1]:
            # partition splits once; also avoids the original's double split
            sentence, _, poet = entry.partition('---')
            writer.writerow([sentence, poet])


# 子网页处理函数：进入并解析子网页/请求子网页
# Sub-page handler: fetch every sub-page collected from the listing.
def request_sub_page(info_list):
    """Download each sub-page URL in ``info_list[0]`` and return the HTML bodies.

    Args:
        info_list: ``[href_list, sentence_list]`` as produced by info_def.

    Returns:
        A list of HTML strings, one per sub-page, in the same order.
    """
    ua = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'}
    return [page_def(link, ua) for link in info_list[0]]


# 子网页处理函数：解析子网页，爬取诗句内容
# Sub-page handler: parse sub-pages and extract the poem text.
def sub_page_def(sub_html):
    """Extract the poem body from each sub-page's HTML.

    Args:
        sub_html: List of HTML strings returned by request_sub_page.

    Returns:
        A list of stripped poem texts; pages with no matching node are skipped.
    """
    poem_list = []
    for page in sub_html:
        # select_one returns the first match or None — equivalent to the
        # original select(...)[0]-with-truthiness-check pattern.
        node = BeautifulSoup(page, 'html.parser').select_one(
            'div.left > div.sons > div.cont > div.contson')
        if node is not None:
            poem_list.append(node.get_text().strip())
    return poem_list


# 子网页处理函数：保存诗句到CSV
# Sub-page handler: save poems to a CSV file.
def sub_page_save(poem_list):
    """Append each poem in ``poem_list`` to poems.csv, one per row.

    Bug fix: the file is opened in append mode, yet the original wrote the
    "Poem" header on every call, duplicating it once per crawled page.
    The header is now written only when the file is empty.

    Args:
        poem_list: List of poem strings from sub_page_def.
    """
    with open(r'poems.csv', 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if f.tell() == 0:  # empty file -> header not yet written
            writer.writerow(["Poem"])
        for poem in poem_list:
            writer.writerow([poem])


if __name__ == '__main__':
    # Crawl pages 1-5 of the quotes listing, saving sentence/poet pairs and
    # the full poem text of every linked sub-page.
    print("**************开始古诗文网站爬虫********************")
    ua = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'}

    for i in range(1, 6):
        # The original built the page number via str(i).strip() — a no-op,
        # since str() of an int never carries whitespace. The f-string below
        # produces a byte-identical URL.
        url = f'https://so.gushiwen.cn/mingjus/default.aspx?page={i}&tstr=&astr=&cstr=&xstr='
        time.sleep(1)  # throttle: one listing request per second
        html = page_def(url, ua)
        info_list = info_def(html)
        csv_def(info_list)

        print("开始解析第%d" % (i) + "页")

        sub_html = request_sub_page(info_list)
        poem_list = sub_page_def(sub_html)
        sub_page_save(poem_list)

    print("****************爬取完成***********************")